3.1 인과관계에 대해 생각해보기
인과모델은 변수들 사이의 인과 관계를 방향성 비순환 그래프(Directed Acyclic Graph, DAG)로 표현합니다. 노드는 변수를, 화살표는 직접적인 인과 영향을 나타냅니다.
import warnings
warnings.filterwarnings("ignore" )
import pandas as pd
import numpy as np
import graphviz as gr
# Grayscale shades, line styles and markers; presumably consumed by plotting
# code outside this excerpt (not referenced in the visible cells).
color = "0.3 0.5 0.7 0.9".split()
linestyle = "- -- : -.".split()
marker = list("ovdp")
# Truncate DataFrame displays to at most six rows.
pd.options.display.max_rows = 6
# Make PNG the default output format for graphviz figures. The trailing
# semicolon is presumably the notebook idiom for hiding the call's echoed
# return value.
gr.set_default_format("png" );
import pandas as pd
# Load the cross-sell email dataset; the path is relative to the directory
# the notebook is run from.
data = pd.read_csv("../data/cross_sell_email.csv" )
# Bare name so the notebook displays the DataFrame.
data
0
0
short
15
0
1
1
short
27
0
2
1
long
17
0
...
...
...
...
...
320
0
no_email
15
0
321
1
no_email
16
0
322
1
long
24
1
323 rows × 4 columns
3.1.1 인과관계 시각화
인과 그래프를 사용하면 복잡한 변수 간의 관계를 시각적으로 명확하게 파악할 수 있습니다.
import graphviz as gr
# Causal graph for the cross-sell email example.
# - U points into conversion, age and gender (presumably unobserved factors).
# - rnd is the only parent of cross_sell_email.
# - cross_sell_email, age and gender each point into conversion.
# The graph is built once: the original repeated this exact construction
# twice, rebuilding an identical object.
g_cross_sell = gr.Digraph()
g_cross_sell.edge("U", "conversion")
g_cross_sell.edge("U", "age")
g_cross_sell.edge("U", "gender")
g_cross_sell.edge("rnd", "cross_sell_email")
g_cross_sell.edge("cross_sell_email", "conversion")
g_cross_sell.edge("age", "conversion")
g_cross_sell.edge("gender", "conversion")
# Bare name so the notebook renders the graph.
g_cross_sell
# Condensed version of the cross-sell graph with a single node X in place of
# the individual covariates.
# rankdir:LR layers the graph from left to right.
# Built once: the original repeated this exact construction twice.
g_cross_sell = gr.Digraph(graph_attr={"rankdir": "LR"})
g_cross_sell.edge("U", "conversion")
g_cross_sell.edge("U", "X")
g_cross_sell.edge("cross_sell_email", "conversion")
g_cross_sell.edge("X", "conversion")
# Bare name so the notebook renders the graph.
g_cross_sell
3.1.2 컨설턴트 영입 여부 결정하기
컨설턴트 영입이 이윤에 미치는 영향을 파악하기 위해 그래프로 모델링해 봅니다.
3.2 그래프 모델 집중 훈련
3.2.1 사슬
# Chain structure: the abstract path T -> M -> Y drawn next to a concrete
# example (causal knowledge -> solve problems -> job promotion).
g = gr.Digraph(graph_attr={"rankdir": "LR"})
g.edges([("T", "M"), ("M", "Y")])
g.node("M", "M")
g.edges(
    [
        ("causal knowledge", "solve problems"),
        ("solve problems", "job promotion"),
    ]
)
g

# Same chain with the middle node of each path filled light grey
# (presumably marking the variable being conditioned on).
g = gr.Digraph(graph_attr={"rankdir": "LR"})
g.edges([("T", "M"), ("M", "Y")])
g.node("M", "M")
g.node("M", color="lightgrey", style="filled")
g.edges(
    [
        ("causal knowledge", "solve problems"),
        ("solve problems", "job promotion"),
    ]
)
g.node("solve problems", color="lightgrey", style="filled")
g
3.2.2 분기
# Fork structure: X is a common parent of Y and T, drawn next to a concrete
# example where statistics points into two different skills.
g = gr.Digraph()
g.edges([("X", "Y"), ("X", "T")])
g.node("X", "X")
g.edges(
    [
        ("statistics", "causal inference"),
        ("statistics", "machine learning"),
    ]
)
g

# Second fork example: "good programmer" is the common parent.
g = gr.Digraph()
g.edges(
    [
        ("good programmer", "can invert a binary tree"),
        ("good programmer", "good employee"),
    ]
)
g
3.2.3 충돌부
# Collider structure: Y and T both point into X; in the concrete example
# statistics and flatter both point into job promotion.
g = gr.Digraph()
g.edges(
    [
        ("Y", "X"),
        ("T", "X"),
        ("statistics", "job promotion"),
        ("flatter", "job promotion"),
    ]
)
g

# Collider with a child node: X2 and "high salary" are filled light grey
# (presumably marking conditioning on a descendant of the collider).
g = gr.Digraph()
g.edges([("Y", "X1"), ("T", "X1"), ("X1", "X2")])
g.node("X2", color="lightgrey", style="filled")
g.edges(
    [
        ("statistics", "job promotion"),
        ("flatter", "job promotion"),
        ("job promotion", "high salary"),
    ]
)
g.node("high salary", color="lightgrey", style="filled")
g
3.2.5 파이썬에서 그래프 쿼리하기
# Seven-node example DAG (A..G), drawn left to right.
g = gr.Digraph(graph_attr={"rankdir": "LR"})
g.edges(
    [
        ("C", "A"),
        ("C", "B"),
        ("D", "A"),
        ("B", "E"),
        ("F", "E"),
        ("A", "G"),
    ]
)
g
import networkx as nx

# NetworkX DAG used for the d-separation queries.
model = nx.DiGraph(
    [
        ("C", "A"),
        ("C", "B"),
        ("D", "A"),
        ("B", "E"),
        ("F", "E"),
        ("A", "G"),
    ]
)

# Two node sets are reported as dependent when they are NOT d-separated given
# the conditioning set. The empty conditioning set is written set(): the
# literal {} is an empty dict, not a set, and the d-separation API expects
# sets of nodes.
# NOTE(review): nx.d_separated is reported as deprecated in favour of
# nx.is_d_separator in recent NetworkX releases - confirm against the pinned
# version before upgrading.
print("Are D and C dependent?")
print(not nx.d_separated(model, {"D"}, {"C"}, set()))

print("Are D and C dependent given A?")
print(not nx.d_separated(model, {"D"}, {"C"}, {"A"}))

print("Are D and C dependent given G?")
print(not nx.d_separated(model, {"D"}, {"C"}, {"G"}))
Are D and C dependent?
False
Are D and C dependent given A?
True
Are D and C dependent given G?
True
# Dependence between G and D, unconditionally and given A.
# set() (not {}, which is an empty dict) is the empty conditioning set.
print("Are G and D dependent?")
print(not nx.d_separated(model, {"G"}, {"D"}, set()))

print("Are G and D dependent given A?")
print(not nx.d_separated(model, {"G"}, {"D"}, {"A"}))
Are G and D dependent?
True
Are G and D dependent given A?
False
# Dependence between A and B, unconditionally and given C.
# set() (not {}, which is an empty dict) is the empty conditioning set.
print("Are A and B dependent?")
print(not nx.d_separated(model, {"A"}, {"B"}, set()))

print("Are A and B dependent given C?")
print(not nx.d_separated(model, {"A"}, {"B"}, {"C"}))
Are A and B dependent?
True
Are A and B dependent given C?
False
# Dependence between G and F, unconditionally and given E.
# set() (not {}, which is an empty dict) is the empty conditioning set.
print("Are G and F dependent?")
print(not nx.d_separated(model, {"G"}, {"F"}, set()))

print("Are G and F dependent given E?")
print(not nx.d_separated(model, {"G"}, {"F"}, {"E"}))
Are G and F dependent?
False
Are G and F dependent given E?
True
3.3 식별 재해석
# Consultancy graph without a consultancy -> profits_next_6m edge:
# profits_prev_6m is the only parent of both remaining nodes.
consultancy_sev = gr.Digraph(graph_attr={"rankdir": "LR"})
consultancy_sev.edges(
    [
        ("profits_prev_6m", "profits_next_6m"),
        ("profits_prev_6m", "consultancy"),
    ]
)
consultancy_sev
# NetworkX version of the consultancy graph with the causal edge
# consultancy -> profits_next_6m deliberately left out.
consultancy_model_severed = nx.DiGraph(
    [
        ("profits_prev_6m", "profits_next_6m"),
        ("profits_prev_6m", "consultancy"),
        # ("consultancy", "profits_next_6m"),  # causal relationship removed
    ]
)

# True when consultancy and profits_next_6m are still associated with nothing
# conditioned on. set() is the empty conditioning set; the literal {} would be
# an empty dict, not a set.
not nx.d_separated(
    consultancy_model_severed, {"consultancy"}, {"profits_next_6m"}, set()
)
# Full consultancy graph, including the consultancy -> profits_next_6m edge.
# profits_prev_6m is filled light grey (presumably marking that it is
# conditioned on).
g_consultancy = gr.Digraph(graph_attr={"rankdir": "LR"})
g_consultancy.edges(
    [
        ("profits_prev_6m", "profits_next_6m"),
        ("profits_prev_6m", "consultancy"),
        ("consultancy", "profits_next_6m"),
    ]
)
g_consultancy.node("profits_prev_6m", color="lightgrey", style="filled")
g_consultancy
3.6 구체적인 식별 예제
# Six-row toy dataset: past profits, a 0/1 consultancy indicator and the
# profits observed afterwards.
df = pd.DataFrame(
    {
        "profits_prev_6m": [1.0, 1.0, 1.0, 5.0, 5.0, 5.0],
        "consultancy": [0, 0, 1, 0, 1, 1],
        "profits_next_6m": [1, 1.1, 1.2, 5.5, 5.7, 5.7],
    }
)
# Bare name so the notebook displays the DataFrame.
df
0
1.0
0
1.0
1
1.0
0
1.1
2
1.0
1
1.2
3
5.0
0
5.5
4
5.0
1
5.7
5
5.0
1
5.7
# Unadjusted comparison: mean future profits of rows with consultancy == 1
# minus the mean of rows with consultancy == 0.
(
    df.loc[df["consultancy"] == 1, "profits_next_6m"].mean()
    - df.loc[df["consultancy"] == 0, "profits_next_6m"].mean()
)

# Adjusted comparison: the same difference computed separately within each
# level of profits_prev_6m.
avg_df = (
    df.groupby(["consultancy", "profits_prev_6m"])["profits_next_6m"]
    .mean()
)
avg_df.loc[1] - avg_df.loc[0]
profits_prev_6m
1.0 0.15
5.0 0.20
Name: profits_next_6m, dtype: float64
# U is a common parent of T and Y, while T reaches Y only through M.
g = gr.Digraph(graph_attr={"rankdir": "LR"})
g.edges([("U", "T"), ("U", "Y"), ("T", "M"), ("M", "Y")])
g
3.7 교란편향
# Confounding: X is a common parent of T and Y. The concrete example uses
# Manager Quality as the common parent of Training and Engagement.
# Each edge is a plain statement; the original wrapped two of the calls in
# throwaway one-element tuples (a leftover from stray trailing commas).
g = gr.Digraph(graph_attr={"rankdir": "LR"})
g.edge("X", "T")
g.edge("X", "Y")
g.edge("T", "Y")
g.edge("Manager Quality", "Training")
g.edge("Manager Quality", "Engagement")
g.edge("Training", "Engagement")
g
3.7.1 대리 교란 요인
# U sits between X1 (its parent) and X2 (its child) and is a common parent of
# T and Y. The concrete example places Manager Quality in the role of U.
g = gr.Digraph()
g.edges(
    [
        ("X1", "U"),
        ("U", "X2"),
        ("U", "T"),
        ("T", "Y"),
        ("U", "Y"),
        ("Manager Quality", "Team's Attrition"),
        ("Manager Quality", "Team's Past Performance"),
        ("Manager's Tenure", "Manager Quality"),
        ("Manager's Education Level", "Manager Quality"),
        ("Manager Quality", "Training"),
        ("Training", "Engagement"),
        ("Manager Quality", "Engagement"),
    ]
)
g
3.7.2 랜덤화 재해석
# Randomized treatment: rnd is T's only parent, and U points only into Y.
g = gr.Digraph(graph_attr={"rankdir": "LR"})
g.edges([("rnd", "T"), ("T", "Y"), ("U", "Y")])
g
3.8 선택편향
3.8.1 충돌부 조건부 설정
# Selection on a collider: T and Y both point into S, which is filled light
# grey. The concrete example uses Response as the shaded node, with New
# Feature and Customer Satisfaction as its parents.
# Each edge is a plain statement; the original wrapped the example edges in
# throwaway one-element tuples (a leftover from stray trailing commas).
g = gr.Digraph(graph_attr={"rankdir": "LR"})
g.edge("T", "S")
g.edge("T", "Y")
g.edge("Y", "S")
g.node("S", color="lightgrey", style="filled")
g.edge("RND", "New Feature")
g.edge("New Feature", "Customer Satisfaction")
g.edge("Customer Satisfaction", "NPS")
g.edge("Customer Satisfaction", "Response")
g.edge("New Feature", "Response")
g.node("Response", "Response", color="lightgrey", style="filled")
g
# NPS graph with the New Feature -> Customer Satisfaction edge left out.
nps_model = nx.DiGraph()
nps_model.add_edges_from(
    [
        ("RND", "New Feature"),
        # ("New Feature", "Customer Satisfaction"),
        ("Customer Satisfaction", "NPS"),
        ("Customer Satisfaction", "Response"),
        ("New Feature", "Response"),
    ]
)

# True when NPS and New Feature are associated once Response is conditioned on.
not nx.d_separated(nps_model, {"NPS"}, {"New Feature"}, {"Response"})
# Simulated experiment: a randomly assigned feature raises satisfaction, NPS
# is a noisy reading of satisfaction, and whether a customer responds depends
# on both the feature and satisfaction.
# The order of the random draws below determines the simulated values.
np.random.seed(2)
n = 100000

# 50/50 random assignment of the feature.
new_feature = np.random.binomial(1, 0.5, n)

# Potential satisfaction without / with the feature (constant +0.4 shift),
# and the one realised under the actual assignment.
satisfaction_0 = np.random.normal(0, 0.5, n)
satisfaction_1 = satisfaction_0 + 0.4
satisfaction = np.where(new_feature == 1, satisfaction_1, satisfaction_0)

# Potential NPS under each arm, and the realised NPS.
nps_0 = np.random.normal(satisfaction_0, 1)
nps_1 = np.random.normal(satisfaction_1, 1)
nps = np.where(new_feature == 1, nps_1, nps_0)

# A customer responds when a noisy score centred on
# new_feature + satisfaction exceeds 1.
responded = (np.random.normal(new_feature + satisfaction, 1) > 1).astype(int)

# Full simulated data, including both potential outcomes.
tr_df = pd.DataFrame(
    {
        "new_feature": new_feature,
        "responded": responded,
        "nps_0": nps_0,
        "nps_1": nps_1,
        "nps": nps,
    }
)

# Same columns with the potential outcomes blanked out and NPS kept only for
# responders.
tr_df_measurable = pd.DataFrame(
    {
        "new_feature": new_feature,
        "responded": responded,
        "nps_0": np.nan,
        "nps_1": np.nan,
        "nps": np.where(responded == 1, nps, np.nan),
    }
)
# Column means per value of new_feature on the full simulated data, which
# still holds both potential outcomes (nps_0, nps_1).
tr_df.groupby("new_feature" ).mean()
new_feature
0
0.183715
-0.005047
0.395015
-0.005047
1
0.639342
-0.005239
0.401082
0.401082
# Column means per value of new_feature on the measurable data, with the nps
# column overwritten by NaN in the displayed result.
(
    tr_df_measurable
    .groupby("new_feature")
    .mean()
    .assign(nps=np.nan)
)
new_feature
0
0.183715
NaN
NaN
NaN
1
0.639342
NaN
NaN
NaN
# Column means of the measurable data for each (responded, new_feature) pair.
tr_df_measurable.groupby(["responded" , "new_feature" ]).mean()
responded
new_feature
0
0
NaN
NaN
NaN
1
NaN
NaN
NaN
1
0
NaN
NaN
0.314073
1
NaN
NaN
0.536106
# Same (responded, new_feature) breakdown on the full simulated data, so the
# potential outcomes nps_0 and nps_1 are available in every group.
tr_df.groupby(["responded" , "new_feature" ]).mean()
responded
new_feature
0
0
-0.076869
0.320616
-0.076869
1
-0.234852
0.161725
0.161725
1
0
0.314073
0.725585
0.314073
1
0.124287
0.536106
0.536106
3.8.2 선택편향 보정
# Selection node S (filled light grey) has parents X and T; U is a common
# parent of X and Y. The concrete example uses Response as the shaded node,
# with Time in App and New Feature as its parents.
# Each edge is a plain statement; the original wrapped the example edges in
# throwaway one-element tuples (a leftover from stray trailing commas).
g = gr.Digraph()
g.edge("U", "X")
g.edge("X", "S")
g.edge("U", "Y")
g.edge("T", "Y")
g.edge("T", "S")
g.node("S", color="lightgrey", style="filled")
g.edge("New Feature", "Customer Satisfaction")
g.edge("Unknown Stuff", "Customer Satisfaction")
g.edge("Unknown Stuff", "Time in App")
g.edge("Time in App", "Response")
g.edge("New Feature", "Response")
g.node("Response", "Response", color="lightgrey", style="filled")
g
# Larger selection graph: solid edges run through X1..X5, dashed edges link
# U and T to Y / S directly; S is filled light grey. Statements keep their
# original order.
g = gr.Digraph(graph_attr={"rankdir": "LR"})
g.edges([("X1", "U"), ("U", "X2"), ("X5", "S")])
g.edge("U", "Y", style="dashed")
g.edge("U", "S", style="dashed")
g.edges(
    [
        ("U", "X3"),
        ("X3", "S"),
        ("Y", "X4"),
        ("X4", "S"),
        ("T", "X5"),
        ("T", "Y"),
    ]
)
g.edge("T", "S", style="dashed")
g.node("S", color="lightgrey", style="filled")
g

# Small graph where X is a common child of Y and T; the trailing semicolon
# presumably suppresses the notebook display.
g = gr.Digraph(graph_attr={"rankdir": "LR"})
g.edges([("Y", "X"), ("T", "X"), ("T", "Y")])
g;
3.8.3 매개자 조건부 설정
# T affects Y directly and through M; M is filled light grey. The concrete
# example uses seniority as the shaded node between woman and salary.
g = gr.Digraph(graph_attr={"rankdir": "LR"})
g.edges([("T", "M"), ("T", "Y"), ("M", "Y")])
g.node("M", color="lightgrey", style="filled")
g.edges(
    [
        ("woman", "seniority"),
        ("woman", "salary"),
        ("seniority", "salary"),
    ]
)
g.node("seniority", color="lightgrey", style="filled")
g

# Same structure, but the shaded node is X, a child of M.
g = gr.Digraph(graph_attr={"rankdir": "LR"})
g.edges([("T", "M"), ("T", "Y"), ("M", "Y"), ("M", "X")])
g.node("X", color="lightgrey", style="filled")
g
3.9 요약
# Summary graph 1: U is a common parent of T and Y.
g = gr.Digraph(graph_attr={"rankdir": "LR", "ratio": "0.3"})
g.edges([("U", "T"), ("U", "Y"), ("T", "Y")])
g

# Summary graph 2: M (between T and Y) and S (child of T and Y) are both
# filled light grey.
g = gr.Digraph(graph_attr={"rankdir": "LR"})
g.edges([("T", "M"), ("M", "Y"), ("T", "Y"), ("T", "S"), ("Y", "S")])
g.node("M", color="lightgrey", style="filled")
g.node("S", color="lightgrey", style="filled")
g

# In-game purchase example: the shaded node "In-Game Purchase > 0" is a child
# of both T and In-Game Purchase.
g = gr.Digraph(graph_attr={"rankdir": "LR"})
g.edges(
    [
        ("T", "In-Game Purchase"),
        ("T", "In-Game Purchase > 0"),
        ("In-Game Purchase", "In-Game Purchase > 0"),
    ]
)
g.node("In-Game Purchase > 0", color="lightgrey", style="filled")
g

# Loan default example: defaults are chained year to year, U points into every
# year, and the year-1 / year-2 nodes are filled (light and dark grey).
g = gr.Digraph(graph_attr={"rankdir": "LR"})
g.edges(
    [
        ("loan amount", "Default at yr=1"),
        ("Default at yr=1", "Default at yr=2"),
        ("Default at yr=2", "Default at yr=3"),
        ("U", "Default at yr=1"),
        ("U", "Default at yr=2"),
        ("U", "Default at yr=3"),
    ]
)
g.node("Default at yr=1", color="lightgrey", style="filled")
g.node("Default at yr=2", color="darkgrey", style="filled")
g